Author

|

Published

November 1, 2024

Code
load('data/special_biom.RData')
load('data/types_biom.RData')

Required Variables and Datasets for Research Question 01

Code
dataset_names <- c("AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", 
                   "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bbi", 
                   "AHSnhs11bbi", "AHSnhs11bbi", "AHSnhs11bbi", "AHSnhs11bbi", 
                   "AHSnhs11bbi", "AHSnhs11bmd", "AHSnhs11bmd", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp")

variable_names <- c("COBCODBC", "YOABC", "AGEB", "BMISC", "SYSTOL", "DIASTOL", 
                    "SEX", "CHOLRESB", "HDLCHREB", "LDLRESB", "CHOLNTR", 
                    "HDLCHSEX", "DIABPRVE", "ATCCURFB", "INSFLAG", "CARDASP", "DIABMQ01", "DIETQ8", "DIETQ5", "DIETQ12", "DIETQ14", "SALTACI", "SALTATI", "MILKFATU", "DIETRDI", "DIETQ1")

variable_descriptions <- c("Country of birth", "Year of arrival in Australia", "Age of person", 
                           "Body Mass Index (BMI) - score measured", "Systolic Blood Pressure (mmHg)", 
                           "Diastolic Blood Pressure (mmHg)", "Sex of person", 
                           "Total cholesterol - ranged (mmol/L)", "HDL cholesterol - ranged (mmol/L)", 
                           "Fasting LDL cholesterol - ranged (mmol/L)", "Total cholesterol status (mmol/L)", 
                           "HDL cholesterol status - sex dependent (mmol/L)", 
                           "Diabetes prevalence - HbA1c (%)", "Type of medication taken in last 2 weeks (code)", 
                           "Whether insulin used daily", "Whether uses aspirin daily for heart or circulatory condition" , "Currently having insulin every day", "Usual daily serves of fruit", "Usual daily serves of vegetables", "How often salt is used in household for cooking or preparing food", "How often salt is added to food at table", "Whether salt used in cooking or preparing food is iodised", "Whether salt added to meal at table is iodised" ,"Fat content of main type of milk usually consumed", "Whether vegetable and fruit consumption met recommended guidelines", "Main type of milk usually consumed")

variable_df1 <- data.frame(Dataset = dataset_names, Variable_Name = variable_names, Variable_Description = variable_descriptions)
variable_df1

Required Variables and Datasets for Research Question 02

Code
dataset_names <- c(
  "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", 
  "AHSnhs11bbi", 
  "AHSnhs11bmd", "AHSnhs11bmd", 
  "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", 
  "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp",
  "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", 
"AHSnhs11bsp"
)
variable_names <- c(
  "COBCODBC", "YOABC", "AGEB", "BMISC", "SYSTOL", "DIASTOL", "SEX", "DIABPRVE", 
  "ATCCURFB", "INSFLAG", "CARDASP", "SMKSTAT", "ALCUSUQ2", "TOTPAC", "ALCSTR01", 
 "ALCWKLY", "PHDKGW2", "ALINTWK", "EXREGUIC", "EXREGUID", "EXFSRMNE", 
  "EXLWMBC", "EXLWVBC",
  "DIETRDI"
)

variable_descriptions <- c(
  "Country of birth", "Year of arrival in Australia", "Age of person", "Body Mass Index (BMI) - score measured", 
  "Systolic Blood Pressure (mmHg)", "Diastolic Blood Pressure (mmHg)", "Sex of person", "Diabetes prevalence - HbA1c (%)", 
  "Type of medication taken in last 2 weeks (code)", "Whether insulin used daily", "Whether uses aspirin daily for heart or circulatory condition", 
  "Smoker status", "Frequency of alcohol consumption in the last 12 months", "Mls of pure alcohol consumed", "Short term alcohol risk (2001 Guidelines)", "Estimated total weekly consumption (in mls)", "Measured weight (kg)", "Average daily intake over week (in mls)", 
  "Whether exercise last week met 150 minutes recommended guidelines", "Whether exercise last week met 150 minutes and 5 sessions recommended guidelines", 
  "Total minutes walked for fitness, recreation or sport in last week", "Total minutes undertaken moderate exercise in last week", 
  "Total minutes undertaken vigorous exercise last week", 
  "Whether vegetable and fruit consumption met recommended guidelines"
)

variable_df2 <- data.frame(Dataset = dataset_names, Variable_Name = variable_names, Variable_Description = variable_descriptions)
variable_df2

Confounding Variables

The following represents the confounding medication variables that we will need to look at further. All individuals who are taking the following medication will be excluded from the analysis

  • CARDASP: Whether uses aspirin daily for heart or circulatory condition (medication for hypertension)
  1. Not applicable
  2. Aspirin used daily for heart or circulatory condition
  3. Aspirin not used daily for heart or circulatory condition
  4. Don’t know if aspirin used daily for heart or circulatory condition
  5. Not stated if aspirin used daily for heart or circulatory condition
  • DIABMQ01: Currently having insulin every day
  1. Not applicable
  2. Currently having insulin every day
  3. Not currently having insulin every day
  4. Don’t know if currently having insulin every day
  • ATCCURFB: Medication B01 17430. ANTITHROMBOTIC AGENTS B06 20190. OTHER HEMATOLOGICAL AGENTS C01 20370. CARDIAC THERAPY C02 21950. ANTIHYPERTENSIVES C03 23030. DIURETICS C04 24020. PERIPHERAL VASODILATORS C05 24460. VASOPROTECTIVES C07 25090. BETA BLOCKING AGENTS C08 25910. CALCIUM CHANNEL BLOCKERS C09 26300. AGENTS ACTING ON THE RENIN-ANGIOTENSIN SYSTEM C10 27090. LIPID MODIFYING AGENTS
Code
raw_AHSnhs11bsp <- read_csv("data/AHSnhs11bsp.csv")
raw_AHSnhs11bbi <- read_csv("data/AHSnhs11bbi.csv")
raw_AHSnhs11bmd <- read_csv("data/AHSnhs11bmd.csv")

raw_AHSnhs11bsp$ID <- paste(raw_AHSnhs11bsp$ABSLID, raw_AHSnhs11bsp$ABSPID, sep = "-")
raw_AHSnhs11bbi$ID <- paste(raw_AHSnhs11bbi$ABSLID, raw_AHSnhs11bbi$ABSPID, sep = "-")
raw_AHSnhs11bmd$ID <- paste(raw_AHSnhs11bmd$ABSLID, raw_AHSnhs11bmd$ABSPID, sep = "-")

Variable Selection

We perform separate variable filtering for each question in order to retain as many observations as possible during the cleaning process.

Research Question 1

The following code gets pulls the variables as defined in the Data Import section.

Code
df1 <- raw_AHSnhs11bsp[, c('ID', variable_df1 |> filter(Dataset == 'AHSnhs11bsp') |> pull(Variable_Name))]
df2<- raw_AHSnhs11bbi[, c('ID', variable_df1 |> filter(Dataset == 'AHSnhs11bbi') |> pull(Variable_Name))]
df3<- raw_AHSnhs11bmd[, c('ID', variable_df1 |> filter(Dataset == 'AHSnhs11bmd') |> pull(Variable_Name))]
df_q1 <- df1 |> 
  full_join(df2, by = "ID") |> 
  full_join(df3, by = "ID")

df_q1 <- as.data.frame(df_q1)

Research Question 2

Code
df1 <- raw_AHSnhs11bsp[, c('ID', variable_df2 |> filter(Dataset == 'AHSnhs11bsp') |> pull(Variable_Name))]
df2<- raw_AHSnhs11bbi[, c('ID', variable_df2 |> filter(Dataset == 'AHSnhs11bbi') |> pull(Variable_Name))]
df3<- raw_AHSnhs11bmd[, c('ID', variable_df2 |> filter(Dataset == 'AHSnhs11bmd') |> pull(Variable_Name))]
df_q2 <- df1 |> 
  full_join(df2, by = "ID") |> 
  full_join(df3, by = "ID")

df_q2 <- as.data.frame(df_q2)

Variable Type

The following lines of code converts the variables to the correct type.

Question 01

Code
for (i in 1:nrow(types_biom)) {
  var_name <- types_biom$variable_name[i]
  var_type <- types_biom$variable_type[i]
  
  if (var_name %in% colnames(df_q1)) {
    if (var_type == 'string') {
      df_q1[[var_name]] <- as.character(df_q1[[var_name]])
    } else if (var_type == 'numeric') {
      df_q1[[var_name]] <- as.numeric(df_q1[[var_name]])
    } else if (var_type == 'categorical') {
      df_q1[[var_name]] <- as.factor(df_q1[[var_name]])
    }
  }
}
Code
df_q1[, c("COBCODBC", "YOABC", "CARDASP", "HDLCHSEX", "DIABPRVE", "INSFLAG")] <- lapply(df_q1[, c("COBCODBC", "YOABC", "CARDASP", "HDLCHSEX", "DIABPRVE", "INSFLAG")], as.factor)

Question 02

Code
for (i in 1:nrow(types_biom)) {
  var_name <- types_biom$variable_name[i]
  var_type <- types_biom$variable_type[i]
  
  if (var_name %in% colnames(df_q2)) {
    if (var_type == 'string') {
      df_q2[[var_name]] <- as.character(df_q2[[var_name]])
    } else if (var_type == 'numeric') {
      df_q2[[var_name]] <- as.numeric(df_q2[[var_name]])
    } else if (var_type == 'categorical') {
      df_q2[[var_name]] <- as.factor(df_q2[[var_name]])
    }
  }
}
Code
df_q2[, c("COBCODBC", "YOABC", "CARDASP", "SMKSTAT", "ALCUSUQ2", "ALCSTR01", "EXREGUIC", "EXREGUID", "DIETRDI", "DIABPRVE", "ATCCURFB", "INSFLAG")] <- lapply(df_q2[, c("COBCODBC", "YOABC", "CARDASP", "SMKSTAT", "ALCUSUQ2", "ALCSTR01", "EXREGUIC", "EXREGUID", "DIETRDI", "DIABPRVE", "ATCCURFB", "INSFLAG")], as.factor)

Missing Values

The following code deals with implicit missing values

Code
miss_defs <- c("measurement not taken - equipment faulty",
               "measurement not taken - other reason",
               "measurement not taken - refusal",
               "not collected",
               "not applicable",
               "not determined",
               "not known",
               "not known if currently on a diet",
               "not measured",
               "not reported",
               "not stated",
               "not used",
               "not taken",
               "not recorded",
               "no answer",
               "not stated",
               "valid reading not obtained",
               "not obtained")


raw_to_tech <- function(proc, special, types)
{
  var_names <- colnames(proc)
  for (j in 1:length(var_names)) 
  {
    var_val <- var_names[j]
    specials <- special %>%
      filter(variable_name==var_val)
    if (nrow(specials)>0) 
    {
      ind <- which(var_names==var_val)
      var_miss_str  <- paste0(var_val,"_MISS")
      var_miss_reas <- rep("observed",nrow(proc))
      var_vals      <- proc[,ind]
      var_type <- types %>% 
        filter(variable_name==var_val) %>%
        select(variable_type) %>%
        as.character()
      if (var_type=="numeric") {
        for (i in 1:length(var_vals)) {
          specials <- specials %>% filter(str_detect(meaning, paste(miss_defs, collapse = "|")))
          if (var_vals[i] %in% specials$value) {
            ind2 <- which(var_vals[i]==specials$value)
            var_vals[i]      <- NA
            var_miss_reas[i] <- specials[ind2,3] %>% as.character()
          }
        }
      }
      if (var_type=="categorical") {
        for (i in 1:length(var_vals)) {
          spec_val  <- specials$value
          spec_meam <- specials$meaning
          if (var_vals[i] %in% spec_val) 
          {
            var_mean <- spec_meam[var_vals[i] == spec_val]
            if (var_mean %in% miss_defs) {
              var_vals[i]      <- NA
              var_miss_reas[i] <- var_mean
            }
          } else {
            var_vals[i]      <- NA
            var_miss_reas[i] <- "unknown"
          }
        }
      }
      if (any(is.na(var_vals))) {
        proc[,ind] <- var_vals
        proc$dummy <- var_miss_reas
        colnames(proc)[ncol(proc)] <- var_miss_str
      }
    }
  }
  return(proc)
}


tech_q1 <- raw_to_tech(df_q1, special_biom, types_biom)
tech_q2 <- raw_to_tech(df_q2, special_biom, types_biom)

The following code is to further clean any and all values above 9995 and assign them NA values.

Code
update_large_values <- function(data, threshold = 9995, reason = "Missing due to unknown reasons") {
  numeric_cols <- colnames(data)[sapply(data, is.numeric)]
  
  for (col in numeric_cols) {
    # Set specific thresholds for columns
    col_threshold <- if (col == "ATCCURFB") {
      99995
    } else if (col == "PHDKGW2") {
      995
    } else {
      threshold
    }
    
    miss_column <- paste0(col, "_MISS")
    
    if (!miss_column %in% colnames(data)) {
      data[[miss_column]] <- "observed"
    }
    
    # Update values based on the threshold
    data[[miss_column]][data[[col]] > col_threshold] <- reason
    data[[col]] <- ifelse(data[[col]] > col_threshold, NA, data[[col]])

    # Special case for PHDKGW2: change 0 values to NA
    if (col == "PHDKGW2") {
      data[[col]] <- ifelse(data[[col]] == 0, NA, data[[col]])
    }
  }
  
  return(data)
}

tech_q1 <- update_large_values(tech_q1)
tech_q2 <- update_large_values(tech_q2)

Visualising Missing Values

Code
g <- tech_q2 %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram() + theme_bw() + ggtitle('Histogram of Numeric Variables')
g

Data looks technically correct now, without substantial or significant missing values that stand out.

Preparation

The following are the given categorical bins for LDL and HDL variables.

HDLCHREB HDL cholesterol - ranged (mmol/L) 1. Less than 1.0 2. 1.0 to less than 1.3 3. 1.3 to less than 1.5 4. 1.5 to less than 2.0 5. 2.0 to less than 2.5 6. 2.5 or more 7. Not applicable 8. Not reported

LDLRESB Fasting LDL cholesterol - ranged (mmol/L) 01. Less than 1.5 02. 1.5 to less than 2.0 03. 2.0 to less than 2.5 04. 2.5 to less than 3.0 05. 3.0 to less than 3.5 06. 3.5 to less than 4.0 07. 4.0 to less than 4.5 08. 4.5 or more 97. Not applicable 98. Not reported

Using this, we will convert this categorical and right censored data to numeric with the following code.

HDL cholesterol (HDLCHREB)

The following code uses the fitdistrplus to perform conversion from categorical to numerical given the bins and sample sizes of each gender and measure found from the published data dictionary. Details on the method can be found in the paper.

Code
library(fitdistrplus)

expected_value_interval_norm <- function(mean, sd, lower, upper) {
  if (is.infinite(upper)) {
    upper <- mean + 10 * sd  
  }
  numerator <- integrate(function(x) x * dnorm(x, mean, sd), lower, upper)$value
  denominator <- pnorm(upper, mean, sd) - pnorm(lower, mean, sd)
  expected_value <- numerator / denominator
  return(expected_value)
}

hdl_male <- data.frame(
  Range = c("<1.0", "1.0-1.3", "1.3-1.5", "1.5-2.0", "2.0-2.5", "≥2.5"),
  Lower = c(0, 1.0, 1.3, 1.5, 2.0, 2.5),
  Upper = c(1.0, 1.3, 1.5, 2.0, 2.5, 3.5),
  Count = c(1591.6, 3468.8, 1774.9, 1327.0, 125.4, 22.5)
)
hdl_male$Count <- round(hdl_male$Count * 1000)

hdl_male_cens <- data.frame(
  left = hdl_male$Lower,
  right = hdl_male$Upper,
  count = hdl_male$Count
)
hdl_male_cens_expanded <- hdl_male_cens[rep(1:nrow(hdl_male_cens), hdl_male_cens$count), 1:2]
fit_hdl_male <- fitdistcens(hdl_male_cens_expanded, "norm")
hdl_male$ExpectedValue <- mapply(
  expected_value_interval_norm,
  mean = fit_hdl_male$estimate["mean"],
  sd = fit_hdl_male$estimate["sd"],
  lower = hdl_male$Lower,
  upper = hdl_male$Upper
)

hdl_female <- data.frame(
  Range = c("<1.0", "1.0-1.3", "1.3-1.5", "1.5-2.0", "2.0-2.5", "≥2.5"),
  Lower = c(0, 1.0, 1.3, 1.5, 2.0, 2.5),
  Upper = c(1.0, 1.3, 1.5, 2.0, 2.5, 3.5),
  Count = c(411.3, 1938.4, 1970.4, 3304.5, 814.3, 72.2)
)
hdl_female$Count <- round(hdl_female$Count * 1000)

hdl_female_cens <- data.frame(
  left = hdl_female$Lower,
  right = hdl_female$Upper,
  count = hdl_female$Count
)
hdl_female_cens_expanded <- hdl_female_cens[rep(1:nrow(hdl_female_cens), hdl_female_cens$count), 1:2]
fit_hdl_female <- fitdistcens(hdl_female_cens_expanded, "norm")
hdl_female$ExpectedValue <- mapply(
  expected_value_interval_norm,
  mean = fit_hdl_female$estimate["mean"],
  sd = fit_hdl_female$estimate["sd"],
  lower = hdl_female$Lower,
  upper = hdl_female$Upper
)

hdl_male$HDL_Category = c(1:6)
hdl_female$HDL_Category = c(1:6)

The following code plots the empirical vs fitted male HDL distribution.

Code
library(ggplot2)

hdl_male$Proportion <- hdl_male$Count / sum(hdl_male$Count)
hdl_female$Proportion <- hdl_female$Count / sum(hdl_female$Count)
max_proportion_female <- max(hdl_female$Proportion)
max_proportion_male <- max(hdl_male$Proportion)

ggplot(hdl_male, aes(x = ExpectedValue, y = Proportion)) +
  geom_bar(stat = "identity", fill = "lightblue", alpha = 0.6) +
  stat_function(fun = function(x) dnorm(x, mean = fit_hdl_male$estimate["mean"], sd = fit_hdl_male$estimate["sd"]) *
                   max_proportion_male, color = "blue", size = 1) +
  labs(title = "Empirical vs. Fitted Proportion Distribution of HDL in Males",
       x = "HDL Level", y = "Proportion") +
  theme_bw()

Code
ggplot(hdl_female, aes(x = ExpectedValue, y = Proportion)) +
  geom_bar(stat = "identity", fill = "pink", alpha = 0.6) +
  stat_function(fun = function(x) dnorm(x, mean = fit_hdl_female$estimate["mean"], sd = fit_hdl_female$estimate["sd"]) *
                   max_proportion_female, color = "red", size = 1) +
  labs(title = "Empirical vs. Fitted Proportion Distribution of HDL in Females",
       x = "HDL Level", y = "Proportion") +
  theme_bw()

Code
hdl_combined <- rbind(
  transform(hdl_male, Sex = "Male"),
  transform(hdl_female, Sex = "Female")
)
ggplot(hdl_combined, aes(x = HDL_Category, y = ExpectedValue, color = Sex, group = Sex)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  labs(title = "Expected HDL Levels by Category for Males and Females",
       x = "HDL Category", y = "Expected HDL Level") +
  theme_minimal() +
  scale_x_continuous(breaks = 1:6, labels = hdl_male$Range) +
  theme(legend.position = "top")

The following code saves the data.

Code
hdl_male_lookup <- data.frame(
  HDLCHREB = 1:6,
  HDL_Expected = hdl_male$ExpectedValue
)

hdl_female_lookup <- data.frame(
  HDLCHREB = 1:6,
  HDL_Expected = hdl_female$ExpectedValue
)

tech_q1$HDLCHREB <- as.numeric(tech_q1$HDLCHREB)

tech_q1$SEX <- as.character(tech_q1$SEX)

get_hdl_expected <- function(sex, hdlchreb) {
  if (sex == 2) {
    return(hdl_female_lookup$HDL_Expected[match(hdlchreb, hdl_female_lookup$HDLCHREB)])
  } else if (sex == 1) {
    return(hdl_male_lookup$HDL_Expected[match(hdlchreb, hdl_male_lookup$HDLCHREB)])
  } else {
    return(NA)  # Handle cases where SEX is missing or unspecified
  }
}

tech_q1$HDL_Expected <- mapply(get_hdl_expected, tech_q1$SEX, tech_q1$HDLCHREB)

LDL cholesterol (LDLRESB)

Follows the same description as previous.

Code
library(fitdistrplus)

expected_value_interval_norm <- function(mean, sd, lower, upper) {
  if (is.infinite(upper)) {
    upper <- mean + 10 * sd  
  }
  numerator <- integrate(function(x) x * dnorm(x, mean, sd), lower, upper)$value
  denominator <- pnorm(upper, mean, sd) - pnorm(lower, mean, sd)
  expected_value <- numerator / denominator
  return(expected_value)
}

ldl_male <- data.frame(
  Range = c(
    "<1.5",
    "1.5-2.0",
    "2.0-2.5",
    "2.5-3.0",
    "3.0-3.5",
    "3.5-4.0",
    "4.0-4.5",
    ">=4.5"
  ),
  Lower = c(0.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5),
  Upper = c(1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 7),
  Count = c(90.0, 426.5, 851.6, 1395.7, 1447.9, 1129.9, 580.1, 621.8)
)



ldl_male$Count <- round(ldl_male$Count * 1000)

ldl_male_cens <- data.frame(
  left = ldl_male$Lower,
  right = ldl_male$Upper,
  count = ldl_male$Count
)
ldl_male_cens_expanded <- ldl_male_cens[rep(1:nrow(ldl_male_cens), ldl_male_cens$count), 1:2]
fit_ldl_male <- fitdistcens(ldl_male_cens_expanded, "norm")
ldl_male$ExpectedValue <- mapply(
  expected_value_interval_norm,
  mean = fit_ldl_male$estimate["mean"],
  sd = fit_ldl_male$estimate["sd"],
  lower = ldl_male$Lower,
  upper = ldl_male$Upper
)

ldl_female <- data.frame(
  Range = c(
    "<1.5",
    "1.5-2.0",
    "2.0-2.5",
    "2.5-3.0",
    "3.0-3.5",
    "3.5-4.0",
    "4.0-4.5",
    ">=4.5"
  ),
  Lower = c(0.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5),
  Upper = c(1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 7.0),
  Count = c(92.3, 442.7, 1094.5, 1665.1, 1379.4, 1116.7, 546.5, 501.1)
)



ldl_female$Count <- round(ldl_female$Count * 1000)

ldl_female_cens <- data.frame(
  left = ldl_female$Lower,
  right = ldl_female$Upper,
  count = ldl_female$Count
)
ldl_female_cens_expanded <- ldl_female_cens[rep(1:nrow(ldl_female_cens), ldl_female_cens$count), 1:2]
fit_ldl_female <- fitdistcens(ldl_female_cens_expanded, "norm")
ldl_female$ExpectedValue <- mapply(
  expected_value_interval_norm,
  mean = fit_ldl_female$estimate["mean"],
  sd = fit_ldl_female$estimate["sd"],
  lower = ldl_female$Lower,
  upper = ldl_female$Upper
)

ldl_male$LDL_Category = c(1:8)
ldl_female$LDL_Category = c(1:8)
Code
ldl_male_lookup <- data.frame(
  LDLRESB = 1:8,
  LDL_Expected = ldl_male$ExpectedValue
)

ldl_female_lookup <- data.frame(
  LDLRESB = 1:8,
  LDL_Expected = ldl_female$ExpectedValue
)

tech_q1$LDLRESB <- as.numeric(tech_q1$LDLRESB)

tech_q1$SEX <- as.character(tech_q1$SEX)

get_ldl_expected <- function(sex, LDLRESB) {
  if (sex == 2) {
    return(ldl_female_lookup$LDL_Expected[match(LDLRESB, ldl_female_lookup$LDLRESB)])
  } else if (sex == 1) {
    return(ldl_male_lookup$LDL_Expected[match(LDLRESB, ldl_male_lookup$LDLRESB)])
  } else {
    return(NA)  # Handle cases where SEX is missing or unspecified
  }
}

tech_q1$LDL_Expected <- mapply(get_ldl_expected, tech_q1$SEX, tech_q1$LDLRESB)
Code
ldl_male$Proportion <- ldl_male$Count / sum(ldl_male$Count)
ldl_female$Proportion <- ldl_female$Count / sum(ldl_female$Count)

max_proportion_male <- max(ldl_male$Proportion)
max_proportion_female <- max(ldl_female$Proportion)

max_density_male <- dnorm(fit_ldl_male$estimate["mean"], mean = fit_ldl_male$estimate["mean"], sd = fit_ldl_male$estimate["sd"])
max_density_female <- dnorm(fit_ldl_female$estimate["mean"], mean = fit_ldl_female$estimate["mean"], sd = fit_ldl_female$estimate["sd"])

scale_factor_male <- max_proportion_male / max_density_male
scale_factor_female <- max_proportion_female / max_density_female

ggplot(ldl_male, aes(x = ExpectedValue, y = Proportion)) +
  geom_bar(stat = "identity", fill = "lightblue", alpha = 0.6) +
  stat_function(fun = function(x) dnorm(x, mean = fit_ldl_male$estimate["mean"], sd = fit_ldl_male$estimate["sd"]) *
                   scale_factor_male, color = "blue", size = 1) +
  labs(title = "Empirical vs. Fitted Proportion Distribution of LDL in Males",
       x = "LDL Level", y = "Proportion") +
  theme_minimal()

Code
ggplot(ldl_female, aes(x = ExpectedValue, y = Proportion)) +
  geom_bar(stat = "identity", fill = "pink", alpha = 0.6) +
  stat_function(fun = function(x) dnorm(x, mean = fit_ldl_female$estimate["mean"], sd = fit_ldl_female$estimate["sd"]) *
                   scale_factor_female, color = "red", size = 1) +
  labs(title = "Empirical vs. Fitted Proportion Distribution of LDL in Females",
       x = "LDL Level", y = "Proportion") +
  theme_minimal()

Code
ldl_combined <- rbind(
  transform(ldl_male, Sex = "Male"),
  transform(ldl_female, Sex = "Female")
)

ggplot(ldl_combined, aes(x = LDL_Category, y = ExpectedValue, color = Sex, group = Sex)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  labs(title = "Expected LDL Levels by Category for Males and Females",
       x = "LDL Category", y = "Expected LDL Level") +
  theme_minimal() +
  scale_x_continuous(breaks = 1:8, labels = ldl_male$Range) +
  theme(legend.position = "top")

Excluding people with medication

Code
non_medication_tech_q1 <- tech_q1 %>%
  filter(CARDASP != 1,                
         DIABMQ01 != 1,               
         !ATCCURFB %in% c(17430, 20190, 20370, 21950, 23030, 24020, 24460, 25090, 25910, 26300, 27090))  

non_medication_tech_q1

LDL-HDL Ratio

Code
non_medication_tech_q1$LDL_HDL_ratio <- non_medication_tech_q1$LDL_Expected / non_medication_tech_q1$HDL_Expected

Selecting only required variables

Code
final_q1 <- non_medication_tech_q1 |> dplyr::select(LDL_HDL_ratio, ID, COBCODBC, YOABC, AGEB, SYSTOL, DIASTOL, LDL_Expected, DIETQ8, DIETQ5, SALTATI, SALTACI, MILKFATU, DIETRDI, DIETQ1)
final_q1 <- na.omit(final_q1) |> unique() 
variable_df1 |> filter(Variable_Name %in%colnames(final_q1)) 
Code
table(final_q1$COBCODBC)

   1    2    3 
2891  506  574 
Code
table(final_q1$YOABC)

   1    2    3    4    5    6    7 
2891  536   86   66   89  123  180 

Filtering more columns of unknown responses

Code
final_q1 <- final_q1 |> filter(DIETQ5 != 0) |> filter(SALTATI %in% c(1,2,3)) |> filter(SALTACI %in% c(1,2,3))|> filter(! MILKFATU %in% c(0,4))|> filter(! DIETQ1 %in% c(0,6))|> filter(! DIETRDI %in% c(0,3)) |> filter(! DIETQ1  %in% c(3,4)) 

Regrouping Fruit and Vegetables column

Code
final_q1 <- final_q1 %>%
  mutate(DIETQ8 = case_when(
    DIETQ8 %in% c(7, 8) ~ 0,  
    TRUE ~ DIETQ8  
  ))


final_q1 <- final_q1 %>%
  mutate(DIETQ5 = case_when(
    DIETQ5 %in% c(7, 8) ~ 0,  
    TRUE ~ DIETQ5  
  ))

IDA

SALTATI

Code
# Define labels for COBCODBC
labels <- c("Australians", "Main English Speaking Countries", "Other")

# Create the plot with relabeled legend
ggplot(final_q1, aes(x = SALTATI, y = LDL_Expected, color = COBCODBC, group = COBCODBC)) +
  stat_summary(fun = "mean", geom = "point", size = 3) +  # Points for means
  stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC)) +  # Line connecting means
  labs(title = "Interaction Plot: SALTATI and Absolute LDL by Country of Birth",
       x = "SALTATI", 
       y = "Absolute LDL (mmol/L)",
       color = "Country of Birth") +  # Change legend title
  scale_color_manual(values = c("lightblue3", "tomato3", "palegreen4"), labels = labels) +
  theme_bw()

Code
# Define labels for COBCODBC
labels <- c("Australians", "Main English Speaking Countries", "Other")

# Create the plot with relabeled legend
ggplot(final_q1, aes(x = SALTATI, y = LDL_HDL_ratio, color = COBCODBC, group = COBCODBC)) +
  stat_summary(fun = "mean", geom = "point", size = 3) +  # Points for means
  stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC), size = 1) +  # Line connecting means
  labs(title = "Interaction Plot: SALTATI and LDL/HDL Ratio by Country of Birth",
       x = "SALTATI", 
       y = "LDL/HDL Ratio",
       color = "Country of Birth") +  # Change legend title
  scale_color_manual(values = c("lightblue3", "tomato3", "palegreen4"), labels = labels) +
  theme_bw()

SALTACI

Code
table(final_q1$SALTACI , final_q1$COBCODBC )
   
       1    2    3
  1  750  160  257
  2  450   74  159
  3 1443  238   86
Code
final_q1$SALTACI <- as.factor(final_q1$SALTACI)
ggplot(final_q1, aes(x = SALTACI, y = LDL_HDL_ratio, color = COBCODBC, group = COBCODBC)) +
  # geom_boxplot(position = position_dodge(width = 0.75)) +  # Boxplot for each level of DIETQ5
  stat_summary(fun = "mean", geom = "point", size = 3) +  # Points for means
  stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC)) +  # Line connecting means
  labs(title = "Interaction Plot: Categorical DIETQ5 and LDL/HDL Ratio by Country of Birth",
       x = "SALTACI", 
       y = "LDL/HDL Ratio") +
  theme_bw()

MILKFATU

Code
final_q1$MILKFATU <- as.factor(final_q1$MILKFATU)
ggplot(final_q1, aes(x = MILKFATU, y = LDL_HDL_ratio, color = COBCODBC, group = COBCODBC)) +
  # geom_boxplot(position = position_dodge(width = 0.75)) +  # Boxplot for each level of DIETQ5
  stat_summary(fun = "mean", geom = "point", size = 3) +  # Points for means
  stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC)) +  # Line connecting means
  labs(title = "Interaction Plot: Categorical DIETQ5 and LDL/HDL Ratio by Country of Birth",
       x = "MILKFATU", 
       y = "LDL/HDL Ratio") +
  theme_bw()

Code
table(final_q1$MILKFATU,final_q1$COBCODBC )
   
       1    2    3
  1  993  160  241
  2 1070  197  159
  3  451   97   78
  5  129   18   24

DIETRDI

Code
final_q1$DIETRDI <- as.factor(final_q1$DIETRDI)
ggplot(final_q1, aes(x = DIETRDI, y = LDL_HDL_ratio, color = COBCODBC, group = COBCODBC)) +
  # geom_boxplot(position = position_dodge(width = 0.75)) +  # Boxplot for each level of DIETQ5
  stat_summary(fun = "mean", geom = "point", size = 3) +  # Points for means
  stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC)) +  # Line connecting means
  labs(title = "Interaction Plot: Categorical DIETQ5 and LDL/HDL Ratio by Country of Birth",
       x = "DIETRDI", 
       y = "LDL/HDL Ratio") +
  theme_bw()

Code
table(final_q1$MILKFATU,final_q1$COBCODBC )
   
       1    2    3
  1  993  160  241
  2 1070  197  159
  3  451   97   78
  5  129   18   24

DIETQ1

Code
final_q1$DIETQ1 <- as.factor(final_q1$DIETQ1)
ggplot(final_q1, aes(x = DIETQ1, y = LDL_HDL_ratio, color = COBCODBC, group = COBCODBC)) +
  # geom_boxplot(position = position_dodge(width = 0.75)) +  # Boxplot for each level of DIETQ5
  stat_summary(fun = "mean", geom = "point", size = 3) +  # Points for means
  stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC)) +  # Line connecting means
  labs(title = "Interaction Plot: Categorical DIETQ5 and LDL/HDL Ratio by Country of Birth",
       x = "DIETQ1", 
       y = "LDL/HDL Ratio") +
  theme_bw()

DIETQ5

Code
ggplot(final_q1, aes(x = DIETQ5, y = LDL_HDL_ratio, color = COBCODBC)) +
  geom_point()+
  geom_smooth(method = "lm", se = FALSE) + facet_wrap(~ COBCODBC, scales = "free_y") +
  labs(title = "LDL/HDL Ratio against DIETQ5 and COBCODBC", x = "DIETQ5",y = "LDL/HDL Ratio") +
  theme_bw()

DIETQ8

Code
labels <- c("Australians", "Main English Speaking Countries", "Other")
ggplot(final_q1, aes(x = DIETQ8, y = LDL_Expected, color = COBCODBC)) +
  geom_point(position = 'jitter', alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +  # Add regression lines for each group
  labs(title = "Interaction Between Fruit Intake (DIETQ8) and Absolute LDL by Country of Birth",
       x = "Fruit Intake (DIETQ8)",
       y = "Absolute LDL") + facet_wrap(~ COBCODBC, scales = "free_y") +
  theme_bw()+  # Change legend title
  scale_color_manual(values = c("lightblue4", "tomato3", "palegreen4"), labels = labels) +
  theme_bw()

Code
labels <- c("Australians", "Main English Speaking Countries", "Other")
ggplot(final_q1, aes(x = DIETQ8, y = LDL_HDL_ratio, color = COBCODBC)) +
  geom_point(position = 'jitter', alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +  # Add regression lines for each group
  labs(title = "Interaction Between Fruit Intake (DIETQ8) and LDL/HDL Ratio by Country of Birth",
       x = "Fruit Intake (DIETQ8)",
       y = "LDL/HDL Ratio") + facet_wrap(~ COBCODBC, scales = "free_y") +
  theme_bw()+  # Change legend title
  scale_color_manual(values = c("lightblue4", "tomato3", "palegreen4"), labels = labels) +
  theme_bw()

DIETQ5 and DIETQ8

Code
# Heatmap showing the interaction between DIETQ5 and DIETQ8 on LDL/HDL ratio
ggplot(final_q1, aes(x = DIETQ5, y = DIETQ8, fill = LDL_HDL_ratio)) +
  geom_tile() +
  facet_wrap(~COBCODBC) +  # Separate plots for each country
  labs(title = "Heatmap of LDL/HDL Ratio Based on Vegetable and Fruit Intake",
       x = "Vegetable Intake (DIETQ5)",
       y = "Fruit Intake (DIETQ8)",
       fill = "LDL/HDL Ratio") +
  theme_bw()

Linear Mixed Effects Model

All Interactions

Code
final_q1$DIETQ8 <- as.numeric(final_q1$DIETQ8 )
final_q1$DIETQ5 <- as.numeric(final_q1$DIETQ5 )
final_q1$DIETQ1 <- as.factor(final_q1$DIETQ1 )
final_q1$SALTATI <- as.factor(final_q1$SALTATI )
final_q1$SALTACI <- as.factor(final_q1$SALTACI )
final_q1$MILKFATU <- as.factor(final_q1$MILKFATU )
Code
model_ldl_hdl <- lmer(
  LDL_Expected ~ +  
    (DIETQ5 + DIETQ8 + MILKFATU  + DIETQ1 ) * COBCODBC + 
    (1 | AGEB),  
  data = final_q1
)
summary(model_ldl_hdl)
Linear mixed model fit by REML ['lmerMod']
Formula: LDL_Expected ~ +(DIETQ5 + DIETQ8 + MILKFATU + DIETQ1) * COBCODBC +  
    (1 | AGEB)
   Data: final_q1

REML criterion at convergence: 8767.6

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.98604 -0.66729 -0.05657  0.63333  2.85462 

Random effects:
 Groups   Name        Variance Std.Dev.
 AGEB     (Intercept) 0.09681  0.3111  
 Residual             0.63989  0.7999  
Number of obs: 3617, groups:  AGEB, 17

Fixed effects:
                     Estimate Std. Error t value
(Intercept)          3.189186   0.086482  36.877
DIETQ5              -0.009637   0.012018  -0.802
DIETQ8              -0.008550   0.013639  -0.627
MILKFATU2           -0.060393   0.035610  -1.696
MILKFATU3           -0.127196   0.045862  -2.773
MILKFATU5           -0.009948   0.075388  -0.132
DIETQ12             -0.155029   0.069825  -2.220
COBCODBC2            0.108738   0.105756   1.028
COBCODBC3            0.057770   0.096541   0.598
DIETQ5:COBCODBC2     0.034987   0.031074   1.126
DIETQ5:COBCODBC3     0.003869   0.030519   0.127
DIETQ8:COBCODBC2    -0.110004   0.036098  -3.047
DIETQ8:COBCODBC3    -0.054198   0.035150  -1.542
MILKFATU2:COBCODBC2 -0.022444   0.092676  -0.242
MILKFATU3:COBCODBC2 -0.039699   0.113146  -0.351
MILKFATU5:COBCODBC2 -0.188127   0.214042  -0.879
MILKFATU2:COBCODBC3  0.033335   0.090315   0.369
MILKFATU3:COBCODBC3  0.070173   0.114892   0.611
MILKFATU5:COBCODBC3 -0.106912   0.187669  -0.570
DIETQ12:COBCODBC2    0.282540   0.189091   1.494
DIETQ12:COBCODBC3    0.242963   0.137189   1.771
fit warnings:
fixed-effect model matrix is rank deficient so dropping 3 columns / coefficients

Assumptions

Linearity

Code
plot(fitted(model_ldl_hdl), resid(model_ldl_hdl))
abline(h = 0, col = "red")

Normality

CLT -> Normality is okay

Code
qqnorm(resid(model_ldl_hdl))
qqline(resid(model_ldl_hdl), col = "red")

Random effect distribution

Code
ranef_vals <- ranef(model_ldl_hdl)$AGEB[[1]]
qqnorm(ranef_vals)
qqline(ranef_vals, col = "red")

Code
hist(ranef_vals, breaks = 20, main = "Random Effects for AGEB", xlab = "Random Effects")

Redo Analysis Using Absolute LDL

Code
model_ldl <- lmer(
  LDL_Expected ~ +  
    (DIETQ5 + DIETQ8 + MILKFATU + SALTATI + SALTACI  + DIETQ1) * COBCODBC + 
    (1 | AGEB),  
  data = final_q1
)


summary(model_ldl)
Linear mixed model fit by REML ['lmerMod']
Formula: LDL_Expected ~ +(DIETQ5 + DIETQ8 + MILKFATU + SALTATI + SALTACI +  
    DIETQ1) * COBCODBC + (1 | AGEB)
   Data: final_q1

REML criterion at convergence: 8789.1

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.0181 -0.6706 -0.0460  0.6365  2.8711 

Random effects:
 Groups   Name        Variance Std.Dev.
 AGEB     (Intercept) 0.09599  0.3098  
 Residual             0.63856  0.7991  
Number of obs: 3617, groups:  AGEB, 17

Fixed effects:
                      Estimate Std. Error t value
(Intercept)          3.2194366  0.0907878  35.461
DIETQ5              -0.0112486  0.0120278  -0.935
DIETQ8              -0.0071896  0.0136769  -0.526
MILKFATU2           -0.0523478  0.0358692  -1.459
MILKFATU3           -0.1142791  0.0465255  -2.456
MILKFATU5           -0.0004972  0.0755298  -0.007
SALTATI2             0.0422872  0.0511258   0.827
SALTATI3            -0.0273859  0.0385363  -0.711
SALTACI2            -0.0228724  0.0532994  -0.429
SALTACI3            -0.0427701  0.0379499  -1.127
DIETQ12             -0.1519029  0.0698199  -2.176
COBCODBC2           -0.1103915  0.1223605  -0.902
COBCODBC3            0.0157575  0.1255820   0.125
DIETQ5:COBCODBC2     0.0358454  0.0311736   1.150
DIETQ5:COBCODBC3     0.0053220  0.0306631   0.174
DIETQ8:COBCODBC2    -0.1216838  0.0362321  -3.358
DIETQ8:COBCODBC3    -0.0574787  0.0351771  -1.634
MILKFATU2:COBCODBC2 -0.0610342  0.0940963  -0.649
MILKFATU3:COBCODBC2 -0.0868113  0.1165373  -0.745
MILKFATU5:COBCODBC2 -0.2281050  0.2145183  -1.063
MILKFATU2:COBCODBC3  0.0286219  0.0904858   0.316
MILKFATU3:COBCODBC3  0.0697412  0.1162668   0.600
MILKFATU5:COBCODBC3 -0.1240031  0.1887486  -0.657
SALTATI2:COBCODBC2   0.1001062  0.1367455   0.732
SALTATI3:COBCODBC2   0.3067783  0.0991748   3.093
SALTATI2:COBCODBC3  -0.0055759  0.1586705  -0.035
SALTATI3:COBCODBC3   0.0480577  0.1048075   0.459
SALTACI2:COBCODBC2   0.0516511  0.1407191   0.367
SALTACI3:COBCODBC2   0.1514477  0.0971475   1.559
SALTACI2:COBCODBC3   0.0478579  0.1051472   0.455
SALTACI3:COBCODBC3  -0.0284917  0.1084511  -0.263
DIETQ12:COBCODBC2    0.2255369  0.1903993   1.185
DIETQ12:COBCODBC3    0.2406180  0.1373757   1.752
fit warnings:
fixed-effect model matrix is rank deficient so dropping 3 columns / coefficients

Code
data = tech_q2
data$SMKSTAT <- as.factor(data$SMKSTAT)
data$ALCUSUQ2 <- as.factor(data$ALCUSUQ2)
data$ALCSTR01 <- as.factor(data$ALCSTR01)
data$DIETRDI <- as.factor(data$DIETRDI)

str(data$ALCUSUQ2)
 Factor w/ 9 levels "0","1","2","3",..: 6 3 2 1 8 5 5 5 5 5 ...
Code
str(data$ALCSTR01)
 Factor w/ 9 levels "0","1","2","3",..: 4 2 2 1 6 4 4 4 4 4 ...
Code
data

Smoking

Code
data_clean1 <- data %>%
  filter(!is.na(SYSTOL) & !is.na(DIASTOL) & !is.na(SMKSTAT) & !is.na(COBCODBC)  & 
         !is.na(AGEB) & !is.na(SEX))
Code
# SYSTOL

data_clean1$SMKSTAT <- as.factor(data_clean1$SMKSTAT)
systol_model1 <- lmer(
  SYSTOL ~  (1 | AGEB) + (1 | SEX)  + SMKSTAT * COBCODBC, 
  data = data_clean1
)

# 
diastol_model1 <- lmer(
  DIASTOL ~  (1 | AGEB) + (1 | SEX)  + SMKSTAT * COBCODBC, 
  data = data_clean1
)
# Summaries
summary(systol_model1)
Linear mixed model fit by REML ['lmerMod']
Formula: SYSTOL ~ (1 | AGEB) + (1 | SEX) + SMKSTAT * COBCODBC
   Data: data_clean1

REML criterion at convergence: 323878.9

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.9270 -0.6633 -0.0834  0.5600  5.8065 

Random effects:
 Groups   Name        Variance Std.Dev.
 AGEB     (Intercept) 134.155  11.583  
 SEX      (Intercept)   6.109   2.472  
 Residual             328.402  18.122  
Number of obs: 37512, groups:  AGEB, 16; SEX, 2

Fixed effects:
                     Estimate Std. Error t value
(Intercept)        127.368674   3.395288  37.513
SMKSTAT2            -2.703954   1.133933  -2.385
SMKSTAT3             0.539468   1.810742   0.298
SMKSTAT4            -2.917129   0.346743  -8.413
SMKSTAT5            -1.831677   0.334580  -5.475
COBCODBC2            0.349355   0.766995   0.455
COBCODBC3            0.706628   0.772916   0.914
SMKSTAT2:COBCODBC2  -1.559212   2.915004  -0.535
SMKSTAT3:COBCODBC2   6.960866   5.391496   1.291
SMKSTAT4:COBCODBC2  -1.191154   0.875963  -1.360
SMKSTAT5:COBCODBC2  -2.272666   0.894549  -2.541
SMKSTAT2:COBCODBC3   1.027121   2.732123   0.376
SMKSTAT3:COBCODBC3   1.299832   5.807906   0.224
SMKSTAT4:COBCODBC3   1.135671   0.907714   1.251
SMKSTAT5:COBCODBC3   0.000531   0.853761   0.001
Code
summary(diastol_model1)
Linear mixed model fit by REML ['lmerMod']
Formula: DIASTOL ~ (1 | AGEB) + (1 | SEX) + SMKSTAT * COBCODBC
   Data: data_clean1

REML criterion at convergence: 285395.6

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.5817 -0.6651 -0.0382  0.6109  5.2710 

Random effects:
 Groups   Name        Variance Std.Dev.
 AGEB     (Intercept)  15.5184  3.9393 
 SEX      (Intercept)   0.1512  0.3888 
 Residual             117.7374 10.8507 
Number of obs: 37512, groups:  AGEB, 16; SEX, 2

Fixed effects:
                   Estimate Std. Error t value
(Intercept)         76.3083     1.0379  73.519
SMKSTAT2            -0.8670     0.6790  -1.277
SMKSTAT3            -0.5169     1.0842  -0.477
SMKSTAT4            -1.3119     0.2076  -6.319
SMKSTAT5            -1.0927     0.2003  -5.455
COBCODBC2           -0.1040     0.4592  -0.226
COBCODBC3            0.5782     0.4628   1.249
SMKSTAT2:COBCODBC2   1.3628     1.7454   0.781
SMKSTAT3:COBCODBC2   5.2133     3.2282   1.615
SMKSTAT4:COBCODBC2  -0.6894     0.5245  -1.314
SMKSTAT5:COBCODBC2  -1.1580     0.5356  -2.162
SMKSTAT2:COBCODBC3   4.9966     1.6359   3.054
SMKSTAT3:COBCODBC3   4.6543     3.4775   1.338
SMKSTAT4:COBCODBC3   0.7167     0.5435   1.319
SMKSTAT5:COBCODBC3   0.4795     0.5112   0.938
Code
data_clean2 <- data %>%
  filter(!is.na(SYSTOL) & !is.na(DIASTOL) & !is.na(COBCODBC)  & !is.na(TOTPAC)  & !is.na(ALCWKLY) &
         !is.na(AGEB) & !is.na(SEX) & !is.na(ALCUSUQ2) & !is.na(ALCSTR01)  & !is.na(PHDKGW2))


data_clean2$TOTPAC_normalise = data_clean2$TOTPAC/data_clean2$PHDKGW2
data_clean2$ALCWKLY_normalise = data_clean2$ALCWKLY/data_clean2$PHDKGW2
Code
# Define labels for COBCODBC
labels <- c("Australians", "Main English Speaking Countries", "Other")

# Create the plot with relabeled legend and x-axis labels for SMKSTAT
ggplot(data_clean2, aes(x = SMKSTAT, y = SYSTOL, color = COBCODBC, group = COBCODBC)) +
  stat_summary(fun = "mean", geom = "point", size = 3) +  # Points for means
  stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC), size = 1) +  # Line connecting means
  labs(title = "Interaction Plot: Smoking Status and SYSTOL by Country of Birth",
       x = "Smoking Status", 
       y = "SYSTOL",
       color = "Country of Birth") +  # Change legend title
  scale_x_discrete(labels = c(
    "0" = "Not applicable",
    "1" = "Current \n smoker daily",
    "2" = "Current \n smoker weekly",
    "3" = "Current \n smoker \n less than weekly",
    "4" = "Ex-smoker",
    "5" = "Never smoked"
  )) +  # Custom x-axis labels for SMKSTAT
  scale_color_manual(values = c("lightblue3", "tomato3", "palegreen4"), labels = labels) +
  theme_bw()

Alcohol

Code
hist(data$TOTPAC)

Code
alcohol_mapping <- c(
  "0" = 0,   # Not applicable
  "1" = 365, # Every day
  "2" = 260, # 5 to 6 days a week (approx. 5.5 * 52)
  "3" = 208, # 3 to 4 days a week (approx. 4 * 52)
  "4" = 104, # 1 to 2 days a week (approx. 2 * 52)
  "5" = 36,  # 2 to 3 days a month (approx. 2.5 * 12)
  "6" = 12,  # 1 day a month
  "7" = 6,   # Less than once a month (approx. 0.5 * 12)
  "8" = NA   # Not known
)

# Apply the mapping to convert ALCUSUQ2 to numeric
data_clean2$ALCUSUQ2_numeric <- as.numeric(alcohol_mapping[as.character(data_clean2$ALCUSUQ2)])
Code
# SYSTOL
data_clean2$TOTPAC_normalise <- as.numeric(data_clean2$TOTPAC_normalise )
data_clean2$ALCWKLY_normalise <- as.numeric(data_clean2$ALCWKLY_normalise )
systol_model2 <- lmer(
  SYSTOL ~  (1 | AGEB) + (1 | SEX) + TOTPAC_normalise * COBCODBC  + ALCWKLY_normalise * COBCODBC  , 
  data = data_clean2
)

diastol_model2 <- lmer(
  DIASTOL ~  (1 | AGEB) + (1 | SEX) + TOTPAC_normalise * COBCODBC  + ALCWKLY_normalise * COBCODBC, 
  data = data_clean2
)
# Summaries
summary(systol_model2)
Linear mixed model fit by REML ['lmerMod']
Formula: SYSTOL ~ (1 | AGEB) + (1 | SEX) + TOTPAC_normalise * COBCODBC +  
    ALCWKLY_normalise * COBCODBC
   Data: data_clean2

REML criterion at convergence: 173334.2

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.9831 -0.6637 -0.1041  0.5583  6.0818 

Random effects:
 Groups   Name        Variance Std.Dev.
 AGEB     (Intercept) 121.16   11.007  
 SEX      (Intercept)   9.22    3.036  
 Residual             308.69   17.570  
Number of obs: 20214, groups:  AGEB, 16; SEX, 2

Fixed effects:
                            Estimate Std. Error t value
(Intercept)                 124.5794     3.4983  35.612
TOTPAC_normalise              0.2467     0.2529   0.975
COBCODBC2                     0.1262     0.5384   0.234
COBCODBC3                     1.2703     0.5452   2.330
ALCWKLY_normalise             0.3676     0.1298   2.832
TOTPAC_normalise:COBCODBC2   -0.5499     0.6075  -0.905
TOTPAC_normalise:COBCODBC3    0.2170     0.8144   0.266
COBCODBC2:ALCWKLY_normalise  -0.1200     0.3282  -0.366
COBCODBC3:ALCWKLY_normalise  -0.3522     0.4351  -0.809

Correlation of Fixed Effects:
                  (Intr) TOTPAC_n COBCODBC2 COBCODBC3 ALCWKL TOTPAC_:COBCODBC2
TOTPAC_nrml       -0.031                                                      
COBCODBC2         -0.021  0.149                                               
COBCODBC3         -0.022  0.153    0.148                                      
ALCWKLY_nrm        0.013 -0.903   -0.042    -0.043                            
TOTPAC_:COBCODBC2  0.011 -0.382   -0.374    -0.065     0.348                  
TOTPAC_:COBCODBC3  0.008 -0.270   -0.051    -0.370     0.246  0.115           
COBCODBC2:A       -0.004  0.329    0.038     0.019    -0.371 -0.883           
COBCODBC3:A       -0.002  0.233    0.016     0.083    -0.266 -0.101           
                  TOTPAC_:COBCODBC3 COBCODBC2:
TOTPAC_nrml                                   
COBCODBC2                                     
COBCODBC3                                     
ALCWKLY_nrm                                   
TOTPAC_:COBCODBC2                             
TOTPAC_:COBCODBC3                             
COBCODBC2:A       -0.100                      
COBCODBC3:A       -0.896             0.109    
Code
summary(diastol_model2)
Linear mixed model fit by REML ['lmerMod']
Formula: DIASTOL ~ (1 | AGEB) + (1 | SEX) + TOTPAC_normalise * COBCODBC +  
    ALCWKLY_normalise * COBCODBC
   Data: data_clean2

REML criterion at convergence: 152993.5

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.2625 -0.6510 -0.0539  0.6046  5.1914 

Random effects:
 Groups   Name        Variance Std.Dev.
 AGEB     (Intercept)  15.1074  3.887  
 SEX      (Intercept)   0.1998  0.447  
 Residual             112.9066 10.626  
Number of obs: 20214, groups:  AGEB, 16; SEX, 2

Fixed effects:
                            Estimate Std. Error t value
(Intercept)                 75.16128    1.03168  72.853
TOTPAC_normalise            -0.35012    0.15289  -2.290
COBCODBC2                   -0.08805    0.32561  -0.270
COBCODBC3                    1.24535    0.32970   3.777
ALCWKLY_normalise            0.42307    0.07847   5.392
TOTPAC_normalise:COBCODBC2   0.28618    0.36741   0.779
TOTPAC_normalise:COBCODBC3  -0.13640    0.49250  -0.277
COBCODBC2:ALCWKLY_normalise -0.28393    0.19848  -1.431
COBCODBC3:ALCWKLY_normalise -0.17536    0.26314  -0.666

Correlation of Fixed Effects:
                  (Intr) TOTPAC_n COBCODBC2 COBCODBC3 ALCWKL TOTPAC_:COBCODBC2
TOTPAC_nrml       -0.063                                                      
COBCODBC2         -0.044  0.149                                               
COBCODBC3         -0.045  0.153    0.148                                      
ALCWKLY_nrm        0.027 -0.903   -0.042    -0.043                            
TOTPAC_:COBCODBC2  0.023 -0.382   -0.374    -0.065     0.348                  
TOTPAC_:COBCODBC3  0.015 -0.271   -0.051    -0.370     0.246  0.115           
COBCODBC2:A       -0.008  0.329    0.038     0.019    -0.371 -0.883           
COBCODBC3:A       -0.005  0.233    0.016     0.083    -0.266 -0.101           
                  TOTPAC_:COBCODBC3 COBCODBC2:
TOTPAC_nrml                                   
COBCODBC2                                     
COBCODBC3                                     
ALCWKLY_nrm                                   
TOTPAC_:COBCODBC2                             
TOTPAC_:COBCODBC3                             
COBCODBC2:A       -0.100                      
COBCODBC3:A       -0.896             0.109    

Exercise

Code
data_clean3 <- data %>%
  filter(!is.na(SYSTOL) & !is.na(DIASTOL) & !is.na(EXREGUIC) & !is.na(COBCODBC)  & 
         !is.na(AGEB) & !is.na(SEX)  & !is.na(EXREGUID)  & !is.na(EXFSRMNE) & !is.na(EXLWMBC) & !is.na(EXLWVBC)) %>% filter(EXREGUID %in% c(1,2)) %>% filter(EXREGUIC %in% c(1,2))

table(data_clean3$EXREGUID)

    0     1     2     9 
    0 14655 21821     0 
Code
hist(data_clean3$EXLWMBC)

Code
systol_model3 <- lmer(
  SYSTOL ~ EXREGUIC * COBCODBC + (1 | AGEB) + (1 | SEX)  + 
  EXREGUID * COBCODBC + EXFSRMNE * COBCODBC + EXLWMBC * COBCODBC + EXLWVBC * COBCODBC, 
  data = data_clean3
)
diastol_model3 <- lmer(
  DIASTOL ~ EXREGUIC * COBCODBC + (1 | AGEB) + (1 | SEX)  + 
  EXREGUID * COBCODBC + EXFSRMNE * COBCODBC + EXLWMBC * COBCODBC + EXLWVBC * COBCODBC, 
  data = data_clean3
)

# Summaries
summary(systol_model3)
Linear mixed model fit by REML ['lmerMod']
Formula: SYSTOL ~ EXREGUIC * COBCODBC + (1 | AGEB) + (1 | SEX) + EXREGUID *  
    COBCODBC + EXFSRMNE * COBCODBC + EXLWMBC * COBCODBC + EXLWVBC *  
    COBCODBC
   Data: data_clean3

REML criterion at convergence: 315700.4

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.8484 -0.6688 -0.0926  0.5613  5.8003 

Random effects:
 Groups   Name        Variance Std.Dev.
 AGEB     (Intercept) 120.429  10.974  
 SEX      (Intercept)   5.555   2.357  
 Residual             334.371  18.286  
Number of obs: 36476, groups:  AGEB, 15; SEX, 2

Fixed effects:
                      Estimate Std. Error t value
(Intercept)          1.255e+02  3.299e+00  38.047
EXREGUIC2           -6.408e-01  4.990e-01  -1.284
COBCODBC2           -2.250e+00  6.669e-01  -3.374
COBCODBC3            1.031e+00  6.467e-01   1.594
EXREGUID2            1.728e+00  4.987e-01   3.464
EXFSRMNE             7.816e-04  8.820e-04   0.886
EXLWMBC              2.040e-03  1.072e-03   1.902
EXLWVBC              1.562e-03  1.193e-03   1.310
EXREGUIC2:COBCODBC2 -1.296e+00  1.170e+00  -1.108
EXREGUIC2:COBCODBC3 -1.186e-01  1.147e+00  -0.103
COBCODBC2:EXREGUID2  3.504e+00  1.176e+00   2.979
COBCODBC3:EXREGUID2  2.080e-01  1.155e+00   0.180
COBCODBC2:EXFSRMNE   8.262e-04  2.017e-03   0.410
COBCODBC3:EXFSRMNE   3.070e-03  2.084e-03   1.473
COBCODBC2:EXLWMBC   -4.331e-03  2.312e-03  -1.873
COBCODBC3:EXLWMBC   -2.654e-03  2.811e-03  -0.944
COBCODBC2:EXLWVBC   -5.127e-03  3.172e-03  -1.616
COBCODBC3:EXLWVBC   -7.331e-03  3.433e-03  -2.135
Code
summary(diastol_model3)
Linear mixed model fit by REML ['lmerMod']
Formula: DIASTOL ~ EXREGUIC * COBCODBC + (1 | AGEB) + (1 | SEX) + EXREGUID *  
    COBCODBC + EXFSRMNE * COBCODBC + EXLWMBC * COBCODBC + EXLWVBC *  
    COBCODBC
   Data: data_clean3

REML criterion at convergence: 277745.9

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.6512 -0.6636 -0.0400  0.6210  5.3538 

Random effects:
 Groups   Name        Variance Std.Dev.
 AGEB     (Intercept)  11.8821  3.4470 
 SEX      (Intercept)   0.2909  0.5393 
 Residual             118.1252 10.8685 
Number of obs: 36476, groups:  AGEB, 15; SEX, 2

Fixed effects:
                      Estimate Std. Error t value
(Intercept)         75.7301093  0.9820006  77.118
EXREGUIC2            0.8049048  0.2965629   2.714
COBCODBC2           -0.4961945  0.3963551  -1.252
COBCODBC3            0.8276915  0.3843725   2.153
EXREGUID2           -0.1939415  0.2964021  -0.654
EXFSRMNE            -0.0002955  0.0005242  -0.564
EXLWMBC              0.0004067  0.0006373   0.638
EXLWVBC             -0.0048991  0.0007088  -6.912
EXREGUIC2:COBCODBC2 -2.7735352  0.6951454  -3.990
EXREGUIC2:COBCODBC3  1.7622351  0.6814629   2.586
COBCODBC2:EXREGUID2  2.6212556  0.6991956   3.749
COBCODBC3:EXREGUID2 -1.2073901  0.6863879  -1.759
COBCODBC2:EXFSRMNE  -0.0012441  0.0011986  -1.038
COBCODBC3:EXFSRMNE  -0.0008997  0.0012386  -0.726
COBCODBC2:EXLWMBC   -0.0013247  0.0013745  -0.964
COBCODBC3:EXLWMBC    0.0022148  0.0016707   1.326
COBCODBC2:EXLWVBC   -0.0080101  0.0018851  -4.249
COBCODBC3:EXLWVBC   -0.0018109  0.0020406  -0.887
Code
# Define labels for COBCODBC
labels <- c("Australians", "Main English Speaking Countries", "Other")

# Create the plot with relabeled legend
ggplot(data_clean3, aes(x = EXREGUID, y = SYSTOL, color = COBCODBC, group = COBCODBC)) +
  stat_summary(fun = "mean", geom = "point", size = 3) +  # Points for means
  stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC), size = 1) +  # Line connecting means
  labs(title = "Interaction Plot: Exercise Guidelines and SYSTOL by Country of Birth",
       x = "Exercise Guidelines", 
       y = "SYSTOL",
       color = "Country of Birth") +  # Change legend title
  scale_color_manual(values = c("lightblue3", "tomato3", "palegreen4"), labels = labels) +
  theme_bw()

Code
labels <- c("Australians", "Main English Speaking Countries", "Other")
ggplot(data_clean3, aes(x = EXLWVBC, y = SYSTOL, color = COBCODBC)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +  # Add regression lines for each group
  labs(title = "Interaction Between Minutes of Vigorous Exercise and SYSTOL by Country of Birth",
       x = "Minutes of Vigorous Exercise",
       y = "SYSTOL") + facet_wrap(~ COBCODBC, scales = "free_y") +
  theme_bw()+  # Change legend title
  scale_color_manual(values = c("lightblue4", "tomato3", "palegreen4"), labels = labels) +
  theme_bw()

Code
ggplot(data_clean3, aes(x = EXREGUID, y = SYSTOL, color = COBCODBC, group = COBCODBC)) +
  stat_summary(fun = "mean", geom = "point", size = 3) +  # Points for means
  stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC), size = 1) +  # Line connecting means
  labs(title = "Interaction Plot: Exercise Guidelines and SYSTOL by Country of Birth",
       x = "Exercise Guidelines", 
       y = "SYSTOL",
       color = "Country of Birth") +  # Change legend title
  scale_x_discrete(labels = c("1" = "Met Guidelines", "2" = "Did Not Meet Guidelines")) +  # Custom x-axis labels for discrete scale
  scale_color_manual(values = c("lightblue3", "tomato3", "palegreen4"), labels = labels) +
  theme_bw()